Meeting 2/19/24

knitr::opts_knit$set(root.dir = here::here())

library(tidyverse)
library(scater)
library(scran)
library(scuttle)
library(clusterProfiler)
library(org.Mm.eg.db)
library(tidytext)
library(slingshot)

theme_set(theme_classic())
read_rds("~/work/JainLabSingleCell23_downstream_v2/results/single-cell-preproc/preprocessed/adult_9646_combined.cellranger.g_mito_cuts.rds") + labs(title="9646")

read_rds("~/work/JainLabSingleCell23_downstream_v2/results/single-cell-preproc/preprocessed/adult_9682_combined.cellranger.g_mito_cuts.rds") + labs(title="9682")

Post-integration overview

sce <- read_rds("~/work/JainLabSingleCell23_downstream_v2/results/single-cell-preproc/integrated/adult.cellranger.sce.integrated.rds")
plotUMAP(sce,colour_by="celltype",text_by="celltype")

plotUMAP(sce,colour_by="label2",text_by="label2")

makePerCellDF(sce) |>
  ggplot(aes(batch,fill=genotype)) +
  geom_bar(position="dodge") +
  scale_fill_grey()

makePerCellDF(sce) |>
  ggplot(aes(celltype,fill=genotype)) +
  geom_bar(position="dodge") +
  scale_fill_grey() +
  facet_wrap(~batch,ncol=1) +
  theme(axis.text.x = element_text(angle=45, hjust=1))

makePerCellDF(sce) |>
  ggplot(aes(celltype,nexprs,fill=genotype)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

Germ cell trajectory

germ_sce <- read_rds("~/work/JainLabSingleCell23_downstream_v2/results/trajectory/adult.cellranger.sce.germ_cell.trajectory.rds")

germ_sce_exportable <- germ_sce
germ_sce_exportable$slingshot <- NULL

embedded <- embedCurves(germ_sce, "UMAP")
embedded <- slingCurves(embedded)[[1]] # only 1 path.
embedded <- data.frame(embedded$s[embedded$ord,])

get_ps_umap <- \(x,lab,text=NULL) plotUMAP(x, colour_by=lab, text_by=text) +
  geom_path(data=embedded[seq(nrow(embedded)-300),], aes(x=Dim.1, y=Dim.2), size=1.2, arrow = arrow(length=unit(0.30,"cm"), ends="last", type = "closed")) +
  labs(title=unique(x$genotype))
get_ps_umap(germ_sce[,germ_sce$genotype==c("WT")],"celltype") +
get_ps_umap(germ_sce[,germ_sce$genotype==c("MUT")],"celltype")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

get_ps_umap(germ_sce[,germ_sce$genotype==c("WT")],"label2","label2") +
get_ps_umap(germ_sce[,germ_sce$genotype==c("MUT")],"label2","label2")

TRADE-seq

trade_res <- read_rds("results/tradeseq/adult.cellranger.sce.germ_cell.tradeseq.rds")
trade_res$slingshot <- NULL
library(tradeSeq)
gtade <- tradeSeq::plotSmoothers(trade_res,
                                 counts=logcounts(trade_res),
                                 gene="L1MdA_I_5end",
                        lwd=1, 
                        nPoints=10)
# --------------- dysreg in adjult vs juvenile

adult_dysreg <- names(Biostrings::readDNAStringSet("results/tradeseq/adult.cellranger.germ_cell.dysregulated_tes.fasta"))

juv_dysreg <- names(Biostrings::readDNAStringSet("results/tradeseq/juvenile_13d_wt_null.cellranger.germ_cell.dysregulated_tes.fasta"))

dysreg <- list(adult=adult_dysreg, juvenile=juv_dysreg) |>
  enframe() |>
  unnest(value) |>
  mutate(dysregulated=T) |>
  pivot_wider(names_from = name, values_from = dysregulated,values_fill = F)

filter(dysreg,str_detect(value,"^L1")) |>
  gt::gt()
value adult juvenile
L1MdV_III_3end TRUE FALSE
L1MdF_III_3end TRUE FALSE
L1MdF_IV_3end TRUE FALSE
L1MdMus_I_5end TRUE FALSE
L1MdTf_II_3end TRUE FALSE
L1MdA_IV_3end TRUE FALSE
L1MdA_I_5end FALSE TRUE
L1MdA_II_5end FALSE TRUE
L1MdA_I_3end FALSE TRUE
L1MdTf_III_5end FALSE TRUE
L1MdF_I_3end FALSE TRUE
L1MdA_I_orf2 FALSE TRUE
L1Lx_I_3end FALSE TRUE
L1MdF_V_3end FALSE TRUE
L1MdA_IV_orf2 FALSE TRUE
L1MdTf_III_orf2 FALSE TRUE
L1MdMus_II_orf2 FALSE TRUE
filter(dysreg,!str_detect(value,"^L1")) |>
  gt::gt()
value adult juvenile
MMERVK10C TRUE TRUE
RLTR10C TRUE TRUE
B2_Mm2 TRUE TRUE
B1_Mus1 TRUE FALSE
B1_Mus2 TRUE FALSE
RLTR6-int TRUE FALSE
B1_Mm TRUE FALSE
IAPEz-int TRUE FALSE
B2_Mm1a TRUE FALSE
MMVL30-int TRUE TRUE
RLTR44-int TRUE FALSE
IAPLTR2_Mm FALSE TRUE
IAPLTR2a FALSE TRUE
B2_Mm1t FALSE TRUE
intersect(adult_dysreg, juv_dysreg)
[1] "MMERVK10C"  "RLTR10C"    "B2_Mm2"     "MMVL30-int"

Differential expression

plot_ma <- function(obj,lab=NULL) {
    mutate(obj,type = if_else(str_detect(feature,"ENSMUSG"),"gene","TE")) |>
    arrange(desc(type)) |>
    ggplot(aes(logCPM,logFC,color=type)) +
    geom_point() +
    labs(title=lab) +
    scale_color_manual(values=c("TE"="red","gene"="gray"))
}

plot_de <- function(obj,lab=NULL) {
    mutate(obj,type = if_else(str_detect(feature,"ENSMUSG"),"gene","TE")) |>
    arrange(desc(type)) |>
    ggplot(aes(logFC,-log10(PValue),color=type)) +
    geom_point() +
    labs(title=lab)  +
    scale_color_manual(values=c("TE"="red","gene"="gray"))
}

RS 17 vs RS 1

summed.sub <- sce[,sce$label2 %in% c("17/RoundSpermatid", "1/RoundSpermatid") & sce$genotype == "MUT"]
summed.sub$label2 <- summed.sub$label2 |> factor() |> relevel(ref="1/RoundSpermatid")

between.res <- pseudoBulkDGE(summed.sub,
    label=rep("dummy", ncol(summed.sub)),
    design=~batch + label2,
    condition=summed.sub$label2,
    coef="label217/RoundSpermatid")[[1]]

between.res |> 
  as_tibble(rownames="feature") |>
  drop_na() |>
  plot_ma(lab="RS17 vs RS1")

between genotype for each celltype

de <- read_tsv("results/differential_expression/adult.tbl.pseudobulk.de.tsv.gz")
Rows: 49552 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): celltype, feature
dbl (5): logFC, logCPM, F, PValue, FDR

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

MA-plots

plot_ma(filter(de,celltype=="Spermatogonia"),lab = "Spermatogonia")

plot_ma(filter(de,celltype=="Spermatocyte"),lab = "Spermatocyte")

plot_ma(filter(de,celltype=="RoundSpermatid"),lab = "RoundSpermatid")

Volcanos

plot_de(filter(de,celltype=="Spermatogonia"),lab = "Spermatogonia")

plot_de(filter(de,celltype=="Spermatocyte"),lab = "Spermatocyte")

plot_de(filter(de,celltype=="RoundSpermatid"),lab = "RoundSpermatid")

de |>
  filter(celltype == "RoundSpermatid" & FDR < 0.05) |>
  arrange(-logFC) |>
  dplyr::select(feature,logFC,logCPM) |>
  head() |>
  gt::gt()
feature logFC logCPM
ENSMUSG00000028177 5.323075 0.20278125
ENSMUSG00000085126 4.949314 -0.04861464
ENSMUSG00000090487 4.460070 1.64918976
ENSMUSG00000072884 4.433038 -0.38806467
ENSMUSG00000091418 4.396233 0.23513199
ENSMUSG00000091718 4.329572 1.57183930
# confirming direction of LFCs make sense
makePerCellDF(germ_sce_exportable, features = "ENSMUSG00000090487") |>
  ggplot(aes(celltype,ENSMUSG00000090487,fill=genotype)) +
  geom_violin(scale = "width") +
  coord_cartesian(ylim=c(0,1))

GO term enrichment

go_enrichment <-filter(de,
                       celltype%in%c("Spermatogonia",
                                     "Spermatocyte",
                                     "RoundSpermatid")) %>%
  filter(str_detect(feature,"ENSMUSG")) %>%
  split(.,.$celltype) |>
  map(~dplyr::select(.x,feature,logFC)) |>
  map(deframe) |>
  map(sort,decreasing=T) |>
  map(~gseGO(.x,ont = "ALL",OrgDb=org.Mm.eg.db,keyType="ENSEMBL",
             maxGSSize =500, 
             minGSSize = 10,
             seed = 2022,
             nPermSimple = 10000,
             eps=0))
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.19% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(...): There were 28 pathways for which P-values were
not calculated properly due to unbalanced (positive and negative) gene-level
statistic values. For such pathways pval, padj, NES, log2err are set to NA. You
can try to increase the value of the argument nPermSimple (for example set it
nPermSimple = 100000)
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.17% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.35% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
go_df <- go_enrichment |>
  map_df(as_tibble,.id="celltype")
go_df |>
  filter(p.adjust < 0.05) |>
  group_by(celltype,sign(NES)) |>
  slice_max(-log10(pvalue),n=20) |>
  group_by(celltype) |>
  filter(n()>3) |>
  ungroup() |>
  mutate(Description=reorder_within(Description,sign(NES)*-log10(pvalue),within=celltype)) |>
  ggplot(aes(-log10(pvalue)*sign(NES),Description)) +
  geom_col() +
  facet_wrap(~celltype,scales="free",ncol=1) +
  scale_y_reordered() +
  xlab("signed log10(p)")

library("AnnotationDbi")


go_df |>
  filter(Description=="methylated histone binding") |>
  filter(celltype=="Spermatocyte") |>
  pull(core_enrichment) |>
  head(1) |>
  str_split("/") |>
  pluck(1) |>
  mapIds(org.Mm.eg.db,keys=_,column="SYMBOL", keytype="ENSEMBL", multiVals="first") |>
  walk(message)
'select()' returned 1:many mapping between keys and columns
Zmynd11
Cbx3
Rag2
Gm20918
Gm20737
Gm20825
Gm20807
Gm21394
Gm20873
Gm20795
Gm21118
Gm20806
Gm20773
Gm21310
Gm20854
Gm20816
Gm21943
Gm20815
Gm20738
Gm20865
Gm20830
Ssty2
Gm20914
de |> filter(feature == "ENSMUSG00000034610") # tut4
# A tibble: 4 × 7
  celltype       feature              logFC logCPM        F    PValue      FDR
  <chr>          <chr>                <dbl>  <dbl>    <dbl>     <dbl>    <dbl>
1 Myoid          ENSMUSG00000034610 -0.163    7.24  0.178   0.674     0.963   
2 RoundSpermatid ENSMUSG00000034610  0.849    4.78 17.3     0.0000322 0.000162
3 Spermatocyte   ENSMUSG00000034610  0.0107   5.79  0.00540 0.941     1.00    
4 Spermatogonia  ENSMUSG00000034610  0.0626   8.01  0.127   0.723     1.00    
de |> filter(feature == "ENSMUSG00000035248") # tut7
# A tibble: 4 × 7
  celltype       feature              logFC logCPM       F  PValue    FDR
  <chr>          <chr>                <dbl>  <dbl>   <dbl>   <dbl>  <dbl>
1 Myoid          ENSMUSG00000035248 -0.259    6.90 0.393   0.531   0.924 
2 RoundSpermatid ENSMUSG00000035248  0.628    4.13 7.16    0.00749 0.0210
3 Spermatocyte   ENSMUSG00000035248 -0.0556   5.44 0.126   0.722   1.00  
4 Spermatogonia  ENSMUSG00000035248  0.0125   4.82 0.00130 0.971   1.00  

DE TEs

de |>
  filter(FDR < 0.05) |>
  mutate(type= if_else(str_detect(feature,"ENSMUSG"),"gene","TE")) |>
  dplyr::count(celltype,type,direction=if_else(logFC > 0,"increased","decreased")) |>
  pivot_wider(names_from = "direction",values_from = "n",values_fill = 0) |>
  gt::gt(rowname_col = "celltype")
type increased decreased
Elongating gene 1 0
Myoid TE 0 1
Myoid gene 17 37
RoundSpermatid TE 52 20
RoundSpermatid gene 2339 3304
Spermatocyte TE 3 2
Spermatocyte gene 164 62
Spermatogonia TE 1 0
Spermatogonia gene 1 54
rs_de_tes <- de |> 
  filter(celltype=="RoundSpermatid") |>
  filter(!str_detect(feature,"ENSMUSG")) |>
  mutate(direction=case_when(FDR>=0.05~"ns",
                             logFC >0~"increased",
                             logFC <0~"decreased",
                             T~NA)) |>
  dplyr::select(celltype,feature,direction,logFC) 
rs_de_tes |>
  filter(direction!="ns") |>
  mutate(feature=fct_reorder(feature,logFC)) |>
  ggplot(aes(logFC,feature)) +
  geom_col() +
  facet_grid(~direction,scales = "free",space = "free")

DFAM hits

Note this is from mm10, but ref we’re using is GRCm39.

dfam_hits <- read_tsv("~/Downloads/mm10.nrph.hits.gz")
Rows: 4542408 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (4): #seq_name, family_acc, family_name, strand
dbl (11): bits, e-value, bias, hmm-st, hmm-en, ali-st, ali-en, env-st, env-e...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfh2 <- dfam_hits |>
  dplyr::select(chr=`#seq_name`,start=`ali-st`,end=`ali-en`,feature=family_name,kimura_div,seqlen=`sq-len`) |>
  right_join(rs_de_tes)
Joining with `by = join_by(feature)`
dfh2 |>
  ggplot(aes(direction,kimura_div)) +
  geom_boxplot()

dfh2 |>
  filter(str_detect(chr,"chrY|chrX")) |>
  mutate(chr2 = if_else(str_detect(chr,"chrY"),"chrY",'chrX')) |>
  ggplot(aes(direction,kimura_div)) +
  geom_boxplot() +
  facet_wrap(~chr2)

dfh2 |>
  #filter(direction == "increased") |>
  filter(!str_detect(chr,"_")) |>
  ggplot(aes(start,color=direction)) +
  geom_density() +
  facet_wrap(~chr,scales="free_x",ncol=1) +
  theme(axis.text.x = element_text(angle=45,hjust=1))

dfh2 |>
  filter(direction!="ns") |>
  group_by(feature,chr,direction,seqlen) |>
  tally() |>
  mutate(hits_per_mb = n/(seqlen/1e6)) |>
  ggplot(aes(feature,chr,fill=hits_per_mb)) +
  geom_tile() +
  facet_wrap(~direction,ncol = 2,scales="free_x") +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5))